From TO/JS - This section sets some flags that can be tweaked to re-run the analysis with different filtering parameters:
RUN_EXAMPLE: allows discarding alleles that have a
frequency below a threshold (defaults to FALSE),
WITHIN_INDIVIDUAL_ALLELE_FREQ_THR: allows discarding
alleles that have a frequency below a threshold (defaults to 0),
BENCHMARK_MARKERS: list of markers to include in the
analysis (defaults to the full list established by Jason, could be
reduced down to 3 for memory optimization),
MAX_MOI_TO_INCLUDE: allows discarding participant data when
they exhibit very complex infection patterns, counted as the total
number of clones across all infection events (defaults to 8),
PRIOR_3RS: a named vector with prior probabilities of
recrudescence (C), relapse (L) or reinfection (I) (defaults to uniform,
i.e. 1/3 each, but tweaked to minimize recrudescence in the present
study).
## RUN_EXAMPLE
## Boolean flag to enable/disable running the example Pv3Rs posterior
## computation on a single episode sampled at random from the dataset.
## DEFAULT: FALSE
RUN_EXAMPLE <- TRUE
## WITHIN_INDIVIDUAL_ALLELE_FREQ_THR
## Minimum threshold above which an allele is preserved in individual
## haplotype data to be preserved when reconstructing infection
## history. Note that this will *not* discard these alleles from the
## population-level allele frequency that is derived from the initial
## visit, but only discard that allele from individual observations
## when it is 'too rare to be exploited'.
## DEFAULT: 0
WITHIN_INDIVIDUAL_ALLELE_FREQ_THR <- 0
## BENCHMARK_MARKERS
## Vector of character string with names matching those from the
## markers of interest for amplicon sequencing. Only the markers
## listed in that vector will be preserved in individual-level data
## and identity of infection will be solely based on the alleles
## observed for these markers.
## The markers are listed by importance as discovered by Jason.
## DEFAULT: all markers (could be suboptimal/too memory-consuming?)
BENCHMARK_MARKERS <- c(
"Chr05",
"Chr07",
"Chr09",
"Chr10",
"Chr08",
"Chr13",
"Chr11",
"Chr03",
"Chr01",
"Chr02",
"Chr14"
)
## MAX_MOI_TO_INCLUDE
## As per Aimee's guidelines:
## We do not recommend running compute_posterior() for data
## whose total genotype count exceeds eight, where the total
## genotype count is the sum of per-episode maximum per-marker
## allele counts.
## The MAX_MOI_TO_INCLUDE expects an integer that will discard all
## individuals having a summed MOI > 8 across all recorded episodes.
## DEFAULT: 8
MAX_MOI_TO_INCLUDE <- 8
## PRIORS_3RS
## A vector of probabilities, summing to 1, corresponding to
## the probability of each stage for the 3Rs for Pv episodes.
## The vector order is re(C)rudescence, re(L)apse, re(I)nfection.
## In this clinical trial, we assume recrudescence is possible so,
## we use the default priors
## DEFAULT: c("C" = 1/3, "L" = 1/3, "I" = 1/3)
PRIOR_3RS <- c("C" = 1/3, "L" = 1/3, "I" = 1/3)
# Note: the below probabilities were used for SeroTAT study because it does not involve treatment at baseline
# PRIOR_3RS <- c("C" = 0.10, "L" = 0.45, "I" = 0.45)
Read in data from Solomon Islands:
sols <- read.csv(here("data/final", "sols_raw_data.csv"))
# head(sols)
# names(sols)
sols %>%
group_by(marker_id) %>%
summarise(unique_haplotypes = n_distinct(Haplotype))
## # A tibble: 11 × 2
## marker_id unique_haplotypes
## <chr> <int>
## 1 Chr01 4
## 2 Chr02 10
## 3 Chr03 6
## 4 Chr05 13
## 5 Chr07 9
## 6 Chr08 2
## 7 Chr09 7
## 8 Chr10 6
## 9 Chr11 3
## 10 Chr13 5
## 11 Chr14 6
sols %>% count(marker_id, Haplotype)
## marker_id Haplotype n
## 1 Chr01 Chr01-1 76
## 2 Chr01 Chr01-2 48
## 3 Chr01 Chr01-3 18
## 4 Chr01 Chr01-4 5
## 5 Chr02 Chr02-1 27
## 6 Chr02 Chr02-11 3
## 7 Chr02 Chr02-2 25
## 8 Chr02 Chr02-3 28
## 9 Chr02 Chr02-4 29
## 10 Chr02 Chr02-5 25
## 11 Chr02 Chr02-6 13
## 12 Chr02 Chr02-7 10
## 13 Chr02 Chr02-8 6
## 14 Chr02 Chr02-9 4
## 15 Chr03 Chr03-1 84
## 16 Chr03 Chr03-2 36
## 17 Chr03 Chr03-3 9
## 18 Chr03 Chr03-4 18
## 19 Chr03 Chr03-5 8
## 20 Chr03 Chr03-6 2
## 21 Chr05 Chr05-1 26
## 22 Chr05 Chr05-10 2
## 23 Chr05 Chr05-11 4
## 24 Chr05 Chr05-12 2
## 25 Chr05 Chr05-13 3
## 26 Chr05 Chr05-2 8
## 27 Chr05 Chr05-3 17
## 28 Chr05 Chr05-4 16
## 29 Chr05 Chr05-5 16
## 30 Chr05 Chr05-6 13
## 31 Chr05 Chr05-7 13
## 32 Chr05 Chr05-8 13
## 33 Chr05 Chr05-9 5
## 34 Chr07 Chr07-1 57
## 35 Chr07 Chr07-10 3
## 36 Chr07 Chr07-11 2
## 37 Chr07 Chr07-2 39
## 38 Chr07 Chr07-3 33
## 39 Chr07 Chr07-4 18
## 40 Chr07 Chr07-5 11
## 41 Chr07 Chr07-6 8
## 42 Chr07 Chr07-9 3
## 43 Chr08 Chr08-1 108
## 44 Chr08 Chr08-2 23
## 45 Chr09 Chr09-1 80
## 46 Chr09 Chr09-2 22
## 47 Chr09 Chr09-3 34
## 48 Chr09 Chr09-4 18
## 49 Chr09 Chr09-5 9
## 50 Chr09 Chr09-7 2
## 51 Chr09 Chr09-9 2
## 52 Chr10 Chr10-1 66
## 53 Chr10 Chr10-2 52
## 54 Chr10 Chr10-3 16
## 55 Chr10 Chr10-4 11
## 56 Chr10 Chr10-5 6
## 57 Chr10 Chr10-6 5
## 58 Chr11 Chr11-1 78
## 59 Chr11 Chr11-2 39
## 60 Chr11 Chr11-3 33
## 61 Chr13 Chr13-1 40
## 62 Chr13 Chr13-2 55
## 63 Chr13 Chr13-3 45
## 64 Chr13 Chr13-4 5
## 65 Chr13 Chr13-5 4
## 66 Chr14 Chr14-1 51
## 67 Chr14 Chr14-2 46
## 68 Chr14 Chr14-3 25
## 69 Chr14 Chr14-4 21
## 70 Chr14 Chr14-5 16
## 71 Chr14 Chr14-6 9
sols %>%
group_by(sample, marker_id) %>%
summarise(n_haplotypes = n(), .groups = 'drop') %>%
group_by(sample) %>%
summarise(max_moi = max(n_haplotypes))
## # A tibble: 135 × 2
## sample max_moi
## <chr> <int>
## 1 AR-002-0 1
## 2 AR-006-0 2
## 3 AR-007-0 1
## 4 AR-008-0 2
## 5 AR-009-0 2
## 6 AR-015-0 2
## 7 AR-023-0 2
## 8 AR-024-0 1
## 9 AR-024-112 2
## 10 AR-024-56 2
## # ℹ 125 more rows
Participants with more than one episode for inference (n=41 participants, n=99 Pv isolates):
sols %>%
select(info, episodes) %>%
distinct() %>%
arrange(info) %>%
filter(episodes>1)
## info episodes
## 1 AR-024 5
## 2 AR-067 2
## 3 AR-081 3
## 4 AR-098 2
## 5 AR-110 2
## 6 AR-133 3
## 7 AR-140 3
## 8 AR-142 4
## 9 AR-160 2
## 10 AR-173 2
## 11 AR-176 3
## 12 AR-187 4
## 13 AR-192 2
## 14 AR-214 2
## 15 AR-221 2
## 16 AR-234 2
## 17 AR-237 2
## 18 AR-246 2
## 19 AR-248 2
## 20 AR-250 2
## 21 AR-258 2
## 22 AR-274 2
## 23 AR-288 2
## 24 AR-300 2
## 25 AR-302 2
## 26 AR-304 2
## 27 AR-306 3
## 28 AR-322 2
## 29 AR-325 2
## 30 AR-326 2
## 31 AR-328 3
## 32 AR-341 2
## 33 AR-343 2
## 34 AR-345 2
## 35 AR-346 2
## 36 AR-348 2
## 37 AR-355 2
## 38 AR-359 6
## 39 AR-366 2
## 40 AR-382 2
## 41 AR-383 2
ids_recurrent <- sols %>% clean_names() %>% filter(episodes > 1) %>% distinct(info)
sols_recurrent <- sols %>% clean_names() %>%filter(episodes > 1)
# setdiff(ids_recurrent$info, sols_recurrent$info)
# setdiff(sols_recurrent$info, ids_recurrent$info)
# sols_recurrent %>% distinct(sample) %>% arrange(sample) # 99 isolates
sols_recurrent %>%
distinct(sample) %>%
separate(sample, into = c("sample", "episode_day"), sep = "-(?=[0-9]+$)") %>%
ggplot(aes(x = episode_day, y = sample)) +
geom_line(aes(group = sample), color = "darkgrey") +
geom_point() +
theme_bw()
episode_summary <- sols_recurrent %>%
distinct(info, info_d, vis_date, sample) %>%
mutate(vis_date = mdy(vis_date)) %>%
arrange(info, vis_date) %>%
group_by(info) %>%
mutate(
# get the episode number
episode_number = row_number(),
# calculate days since enrolment, just a check with dates
days_since_enrolment = as.integer(vis_date - min(vis_date[info_d == 0])),
# calculate days since last episode using lag() and making enrolment episodes 0 days since last
days_since_last_episode = replace_na(as.integer(vis_date - lag(vis_date)), 0)
) %>%
ungroup()
We want to remove any haplotypes that appear only once at very low frequencies. Haplotypes should already be filtered to be observed in at least 2 samples and within-host frequency >=1% (in full dataset).
singleton_haps <- sols_recurrent %>%
select(sample, marker_id, haplotype, frequency, count) %>%
count(haplotype) %>%
arrange(n) %>%
filter(n==1) %>%
pull(haplotype)
sols_recurrent %>%
filter(haplotype %in% singleton_haps) %>%
ggplot(aes(x = haplotype, y = count)) +
geom_hline(yintercept = 100, linetype = "dashed") +
geom_point(aes(color = frequency, shape = follow_x), size = 3) +
labs(x = "singleton haplotype",
y = "read count",
color = "within-sample frequency",
shape = "timepoint") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
## Haplotypes for inference As per Aimee, to avoid bias due to
within-host selection of recrudescent parasites, we use only enrolment
episodes to estimate population-level alelle frequencies. So, we will
filter out any haplotypes that do not appear at baseline (drop n=8
haplotypes). Details below on their within-sample frequencies and read
counts.
# n=569 haps
baseline_haps <- sols_recurrent %>%
filter(follow_x == "baseline") %>%
pull(haplotype)
sols_recurrent %>%
filter(!haplotype %in% baseline_haps) %>%
select(sample, marker_id, haplotype, frequency, count) %>%
arrange(frequency, count)
## sample marker_id haplotype frequency count
## 1 AR-081-168 Chr09 Chr09-7 0.1066 716
## 2 AR-343-112 Chr07 Chr07-11 0.2332 2302
## 3 AR-081-140 Chr09 Chr09-7 0.7282 3518
## 4 AR-024-70 Chr05 Chr05-13 0.9918 7028
## 5 AR-024-56 Chr05 Chr05-13 0.9933 4720
## 6 AR-081-168 Chr05 Chr05-10 1.0000 9622
## 7 AR-024-84 Chr05 Chr05-13 1.0000 9693
## 8 AR-081-140 Chr05 Chr05-10 1.0000 9745
analysis_data <- sols_recurrent %>%
select(subject_id = info,
sample_id = sample,
treatment_arm = trt_label,
days_since_treatment = info_d,
timepoint = follow_x,
visit_date = vis_date,
age_years = age_y,
sex = gender,
episodes,
marker_id,
haplotype,
type,
frequency,
mean_moi,
max_moi) %>%
# add extra epi info on episode number and time since last episode
left_join(episode_summary %>% select(subject_id = info,
sample_id = sample,
episode_number,
days_since_enrolment,
days_since_last_episode),
by = c("subject_id", "sample_id")) %>%
# keep only haplotypes present at baseline
filter(haplotype %in% baseline_haps) %>%
# ensure dates are date class
mutate(visit_date = mdy(visit_date))
test_df <- analysis_data %>%
# Use only baseline samples
filter(timepoint == "baseline") %>%
# filter data frame by marker
filter(marker_id == "Chr11")
test_df %>%
mutate(haplotype = factor(haplotype)) %>%
select(sample_id, haplotype, frequency) %>%
pivot_wider(names_from = haplotype,
values_from = frequency,
values_fill = 0) %>%
pivot_longer(cols = -sample_id,
names_to = "haplotype",
values_to = "frequency") %>%
# Get population-level haplotype frequency,
# correcting for when within-individual sum is not equal
# to 1, as can happen when a minority clone is <2%
group_by(sample_id) %>%
mutate(frequency = frequency / sum(frequency, na.rm = TRUE)) %>%
# get population-level mean freq
group_by(haplotype) %>%
summarise(poplevel_mean_freq = mean(frequency, na.rm = TRUE)) %>%
adorn_totals("row")
## Derive baseline allele frequency - This is modified from Thomas/Jason script for our data
baseline_fs <- analysis_data %>%
# Use only baseline samples
filter(timepoint == "baseline") %>%
# split data frame by marker
group_by(marker_id) %>%
group_split() %>%
# Derive a within-marker list of frequencies, by individual
lapply(function(x) {
x <- x %>%
# Build a within-individual frequency table that
# always includes every haplotype (even the ones absent)
mutate(haplotype = factor(haplotype)) %>%
select(sample_id, haplotype, frequency) %>%
pivot_wider(names_from = haplotype,
values_from = frequency,
values_fill = 0) %>%
pivot_longer(cols = -sample_id,
names_to = "haplotype",
values_to = "frequency") %>%
# Get population-level haplotype frequency,
# correcting for when within-individual sum is not equal
# to 1, as can happen when a minority clone is <2%
group_by(sample_id) %>%
mutate(frequency = frequency / sum(frequency, na.rm = TRUE)) %>%
group_by(haplotype) %>%
summarise(frequency_pop_mean = mean(frequency, na.rm = TRUE))
return(deframe(x))
}) %>%
# get marker_id from group_keys from the group dfs
setNames(nm = analysis_data %>% group_by(marker_id) %>% group_keys() %>% pull(marker_id))
# Here we can also save as dataframe for easier printing and table-ready for paper
baseline_fs_df <- analysis_data %>%
# Use only baseline samples
filter(timepoint == "baseline") %>%
# Group by marker_id and sample_id for further calculations
group_by(marker_id, sample_id) %>%
# Build a within-individual frequency table that always includes every haplotype (even the ones absent)
mutate(haplotype = factor(haplotype)) %>%
select(marker_id, sample_id, haplotype, frequency) %>%
pivot_wider(names_from = haplotype, values_from = frequency, values_fill = list(frequency = 0)) %>%
pivot_longer(cols = -c(marker_id, sample_id), names_to = "haplotype", values_to = "frequency") %>%
# Get population-level haplotype frequency, correcting for when within-individual sum is not equal to 1
group_by(marker_id, sample_id) %>%
mutate(frequency = frequency / sum(frequency, na.rm = TRUE)) %>%
group_by(marker_id, haplotype) %>%
summarise(frequency_pop_mean = mean(frequency, na.rm = TRUE), .groups = 'drop') %>%
# Ensure haplotypes are correctly associated with their marker_id - note that this works for us because , in future would have to make this flexible to allow for haplotype names that are not reliant on having marker_id
filter(str_detect(haplotype, marker_id))
baseline_fs_df
## # A tibble: 67 × 3
## marker_id haplotype frequency_pop_mean
## <chr> <chr> <dbl>
## 1 Chr01 Chr01-1 0.534
## 2 Chr01 Chr01-2 0.330
## 3 Chr01 Chr01-3 0.112
## 4 Chr01 Chr01-4 0.0244
## 5 Chr02 Chr02-1 0.228
## 6 Chr02 Chr02-11 0.00467
## 7 Chr02 Chr02-2 0.133
## 8 Chr02 Chr02-3 0.198
## 9 Chr02 Chr02-4 0.191
## 10 Chr02 Chr02-5 0.101
## # ℹ 57 more rows
# This has been modified from Thomas/Jason script
## Pick an individual at random to run Pv3Rs
# indiv_name <- sample(unique(analysis_data$subject_id), size = 1)
indiv_name <- "AR-067"
## Prepare the data
# 1- Subset haplotype data to specific individual and apply filters
indiv_haplotype_data <- analysis_data %>%
# Restrict to a single patient
filter(subject_id == indiv_name) %>%
# Restrict to a subset of markers, if needed
filter(marker_id %in% BENCHMARK_MARKERS) %>%
# Restrict to summed MOI below threshold
group_by(subject_id, episode_number, marker_id) %>%
mutate(MOI_per_marker = sum(n())) %>%
group_by(subject_id, episode_number) %>%
mutate(MOI_per_episode = max(MOI_per_marker, na.rm = TRUE)) %>%
ungroup() %>%
mutate(marker_id = factor(marker_id, levels = BENCHMARK_MARKERS))
# 2- Calculate per-episode and per-participant MOI for PvR3S eligibility
indiv_MOI <- indiv_haplotype_data %>%
select(subject_id, episode_number,
marker_id, starts_with("MOI_")) %>%
distinct() %>%
group_by(subject_id, episode_number) %>%
# Get highest per-marker MOI only for each episode
# (drop marker_id in case of ties with highest per-marker MOI)
select(-marker_id) %>%
distinct() %>%
filter(MOI_per_marker == max(MOI_per_marker, na.rm = TRUE)) %>%
ungroup() %>%
mutate(MOI_summed = sum(MOI_per_episode, na.rm = TRUE))
Prepare test data
# Finish data preparation
indiv_haplotype_data <- indiv_haplotype_data %>%
group_by(episode_number) %>%
group_split() %>%
lapply(function(x) {
res <- x %>%
select(sample_id, episode_number,
marker_id, haplotype, frequency) %>%
# For sensitivity analysis, allow to include/drop allele
# based on their within-individual frequency
filter(frequency >= WITHIN_INDIVIDUAL_ALLELE_FREQ_THR) %>%
select(-sample_id, -episode_number, -frequency) %>%
distinct() %>%
# Prevent dropping of markers that are not characterised
# by setting .drop to FALSE
group_by(marker_id, .drop = FALSE) %>%
group_split() %>%
lapply(function(y) {
unique(y$haplotype)
})
# Returned a list named with each episode,
# setting marker allele to NA in case none are observed
return(lapply(setNames(res, BENCHMARK_MARKERS),
function(y) {
if (length(y) == 0) return(NA) else return(y)
}))
})
# Run Aimee's posterior estimation
indiv_posterior <- Pv3Rs::compute_posterior(y = indiv_haplotype_data,
fs = baseline_fs[BENCHMARK_MARKERS])
## Number of valid relationship graphs (RGs) is 39
## =============================================================================|
## Computing log p(Y|RG) for 39 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
indiv_posterior
## $marg
## C L I
## [1,] 0 0.9971211 0.00287892
##
## $joint
## C L I
## 0.00000000 0.99712108 0.00287892
The joint posterior probability of relapse is 99.7%, when we look at the genetic data, this is in line with the inferred probability of relpase.
sols_recurrent %>%
filter(info == "AR-067") %>%
ggplot(aes(x = haplotype, y = factor(info_d), fill = factor(haplotype))) +
geom_tile() +
facet_grid(~marker_id, scales = "free_x") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
### Run Pv3Rs one random individual
# This has been modified from Thomas/Jason script
## Pick an individual at random to run Pv3Rs
indiv_name <- sample(unique(analysis_data$subject_id), size = 1)
## Prepare the data
# 1- Subset haplotype data to specific individual and apply filters
indiv_haplotype_data <- analysis_data %>%
# Restrict to a single patient
filter(subject_id == indiv_name) %>%
# Restrict to a subset of markers, if needed
filter(marker_id %in% BENCHMARK_MARKERS) %>%
# Restrict to summed MOI below threshold
group_by(subject_id, episode_number, marker_id) %>%
mutate(MOI_per_marker = sum(n())) %>%
group_by(subject_id, episode_number) %>%
mutate(MOI_per_episode = max(MOI_per_marker, na.rm = TRUE)) %>%
ungroup() %>%
mutate(marker_id = factor(marker_id, levels = BENCHMARK_MARKERS))
# 2- Calculate per-episode and per-participant MOI for PvR3S eligibility
indiv_MOI <- indiv_haplotype_data %>%
select(subject_id, episode_number,
marker_id, starts_with("MOI_")) %>%
distinct() %>%
group_by(subject_id, episode_number) %>%
# Get highest per-marker MOI only for each episode
# (drop marker_id in case of ties with highest per-marker MOI)
select(-marker_id) %>%
distinct() %>%
filter(MOI_per_marker == max(MOI_per_marker, na.rm = TRUE)) %>%
ungroup() %>%
mutate(MOI_summed = sum(MOI_per_episode, na.rm = TRUE))
Now that the data has been subset, Pv3Rs::compute_posterior() will be called (only if the participant meets eligibility criteria as defined by the global variables set earlier in this report).
if (RUN_EXAMPLE & unique(indiv_MOI$MOI_summed) <= MAX_MOI_TO_INCLUDE) {
# Verbose
cat("Running Pv3Rs for: ",
indiv_name,
" (summed MOI = ",
unique(indiv_MOI$MOI_summed),
").\n",
sep = "")
# Finish data preparation
indiv_haplotype_data <- indiv_haplotype_data %>%
group_by(episode_number) %>%
group_split() %>%
lapply(function(x) {
res <- x %>%
select(sample_id, episode_number,
marker_id, haplotype, frequency) %>%
# For sensitivity analysis, allow to include/drop allele
# based on their within-individual frequency
filter(frequency >= WITHIN_INDIVIDUAL_ALLELE_FREQ_THR) %>%
select(-sample_id, -episode_number, -frequency) %>%
distinct() %>%
# Prevent dropping of markers that are not characterised
# by setting .drop to FALSE
group_by(marker_id, .drop = FALSE) %>%
group_split() %>%
lapply(function(y) {
unique(y$haplotype)
})
# Returned a list named with each episode,
# setting marker allele to NA in case none are observed
return(lapply(setNames(res, BENCHMARK_MARKERS),
function(y) {
if (length(y) == 0) return(NA) else return(y)
}))
})
# Run Aimee's posterior estimation
indiv_posterior <- Pv3Rs::compute_posterior(y = indiv_haplotype_data,
fs = baseline_fs[BENCHMARK_MARKERS])
} else {
# Verbose
cat("NOT Running Pv3Rs for: ",
indiv_name,
" because either summed MOI exceeds threshold (observed = ",
unique(indiv_MOI$MOI_summed),
", MAX_MOI_TO_INCLUDE = ",
MAX_MOI_TO_INCLUDE,
"), or RUN_EXAMPLE was set to FALSE.\n",
sep = "")
# Return NULL
indiv_posterior <- NULL
}
indiv_posterior
sols_recurrent %>%
filter(info == indiv_name) %>%
ggplot(aes(x = haplotype, y = factor(info_d), fill = factor(haplotype))) +
geom_tile() +
facet_grid(~marker_id, scales = "free_x") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
# Start timer
t_start <- Sys.time()
# Initialize an empty list to store the results
indiv_posteriors <- list()
# Loop through each unique individual name
for (indiv_name in unique(analysis_data$subject_id)) {
# Verbose
cat("ID : ", indiv_name, "...\n", sep = "")
## Prepare the data
# 1- Subset haplotype data to specific individual and apply filters
indiv_haplotype_data <- analysis_data %>%
# Restrict to a single patient
filter(subject_id == indiv_name) %>%
# Ensure episodes are in order
arrange(episode_number) %>%
# Restrict to a subset of markers
filter(marker_id %in% BENCHMARK_MARKERS) %>%
# Restrict to summed MOI below threshold
group_by(subject_id, episode_number, marker_id) %>%
mutate(MOI_per_marker = sum(n())) %>%
group_by(subject_id, episode_number) %>%
mutate(MOI_per_episode = max(MOI_per_marker, na.rm = TRUE)) %>%
ungroup() %>%
mutate(marker_id = factor(marker_id, levels = BENCHMARK_MARKERS))
# 2- Calculate per-episode and per-participant MOI for PvR3S eligibility
indiv_MOI <- indiv_haplotype_data %>%
select(subject_id, episode_number,
marker_id, starts_with("MOI_")) %>%
distinct() %>%
group_by(subject_id, episode_number) %>%
# Get highest per-marker MOI only for each episode
# (drop marker_id in case of ties with highest per-marker MOI)
select(-marker_id) %>%
distinct() %>%
filter(MOI_per_marker == max(MOI_per_marker, na.rm = TRUE)) %>%
ungroup() %>%
mutate(MOI_summed = sum(MOI_per_episode, na.rm = TRUE))
## Run Pv3Rs only if MOI below threshold
if (unique(indiv_MOI$MOI_summed) <= MAX_MOI_TO_INCLUDE) {
# Verbose
cat("Running Pv3Rs for: ",
indiv_name,
" (summed MOI = ",
unique(indiv_MOI$MOI_summed),
").\n",
sep = "")
# Preserve episode numbers for naming the output list
indiv_episode <- unique(indiv_haplotype_data$episode_number)
cat("The episode number is: ", indiv_episode) # printing to check episode order number is correct
# Finish data preparation
indiv_haplotype_data <- indiv_haplotype_data %>%
group_by(episode_number) %>%
group_split() %>%
lapply(function(x) {
res <- x %>%
select(sample_id, episode_number,
marker_id, haplotype, frequency) %>%
# For sensitivity analysis, allow to include/drop allele
# based on their within-individual frequency
filter(frequency >= WITHIN_INDIVIDUAL_ALLELE_FREQ_THR) %>%
select(-sample_id, -episode_number, -frequency) %>%
distinct() %>%
# Prevent dropping of markers that are not characterized
# by setting .drop to FALSE
group_by(marker_id, .drop = FALSE) %>%
group_split() %>%
lapply(function(y) {
unique(y$haplotype)
})
# Returned a list named with each episode,
# setting marker allele to NA in case none are observed
return(lapply(setNames(res, BENCHMARK_MARKERS),
function(y) {
if (length(y) == 0) return(NA) else return(y)
}))
})
# Run Aimee's posterior estimation
indiv_posterior <- compute_posterior(y = indiv_haplotype_data,
fs = baseline_fs[BENCHMARK_MARKERS],
prior = matrix(PRIOR_3RS,
nrow = length(indiv_haplotype_data),
ncol = length(PRIOR_3RS),
byrow = TRUE,
dimnames = list(c(1:length(indiv_haplotype_data)),
names(PRIOR_3RS))))
} else {
# Verbose
cat("NOT Running Pv3Rs for: ",
indiv_name,
" because summed MOI exceeds threshold (observed = ",
unique(indiv_MOI$MOI_summed),
", MAX_MOI_TO_INCLUDE = ",
MAX_MOI_TO_INCLUDE,
").\n",
sep = "")
# Return NULL
indiv_episode <- unique(indiv_haplotype_data$episode_number)
indiv_posterior <- NULL
}
# Append the results to the list
indiv_posteriors[[indiv_name]] <- list("subject_id" = indiv_name,
"episode_number" = indiv_episode,
"Pv3Rs" = indiv_posterior)
}
## ID : AR-024...
## Running Pv3Rs for: AR-024 (summed MOI = 7).
## The episode number is: 1 2 3 4 5
## Number of valid relationship graphs (RGs) is 14718
## =============================================================================|
## Computing log p(Y|RG) for 14718 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-221...
## Running Pv3Rs for: AR-221 (summed MOI = 3).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 9
## =============================================================================|
## Computing log p(Y|RG) for 9 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-343...
## Running Pv3Rs for: AR-343 (summed MOI = 4).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 39
## =============================================================================|
## Computing log p(Y|RG) for 39 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-366...
## Running Pv3Rs for: AR-366 (summed MOI = 4).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 30
## =============================================================================|
## Computing log p(Y|RG) for 30 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-187...
## Running Pv3Rs for: AR-187 (summed MOI = 5).
## The episode number is: 1 2 3 4
## Number of valid relationship graphs (RGs) is 298
## =============================================================================|
## Computing log p(Y|RG) for 298 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-383...
## Running Pv3Rs for: AR-383 (summed MOI = 2).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 3
## =============================================================================|
## Computing log p(Y|RG) for 3 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-288...
## Running Pv3Rs for: AR-288 (summed MOI = 4).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 39
## =============================================================================|
## Computing log p(Y|RG) for 39 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-234...
## Running Pv3Rs for: AR-234 (summed MOI = 5).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 172
## =============================================================================|
## Computing log p(Y|RG) for 172 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-306...
## Running Pv3Rs for: AR-306 (summed MOI = 5).
## The episode number is: 1 2 3
## Number of valid relationship graphs (RGs) is 250
## =============================================================================|
## Computing log p(Y|RG) for 250 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-302...
## Running Pv3Rs for: AR-302 (summed MOI = 4).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 39
## =============================================================================|
## Computing log p(Y|RG) for 39 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-341...
## Running Pv3Rs for: AR-341 (summed MOI = 5).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 172
## =============================================================================|
## Computing log p(Y|RG) for 172 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-326...
## Running Pv3Rs for: AR-326 (summed MOI = 5).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 172
## =============================================================================|
## Computing log p(Y|RG) for 172 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-067...
## Running Pv3Rs for: AR-067 (summed MOI = 4).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 39
## =============================================================================|
## Computing log p(Y|RG) for 39 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-322...
## Running Pv3Rs for: AR-322 (summed MOI = 3).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 9
## =============================================================================|
## Computing log p(Y|RG) for 9 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-355...
## Running Pv3Rs for: AR-355 (summed MOI = 3).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 9
## =============================================================================|
## Computing log p(Y|RG) for 9 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-081...
## Running Pv3Rs for: AR-081 (summed MOI = 7).
## The episode number is: 1 2 3
## Number of valid relationship graphs (RGs) is 9773
## =============================================================================|
## Computing log p(Y|RG) for 9773 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-248...
## Running Pv3Rs for: AR-248 (summed MOI = 4).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 39
## =============================================================================|
## Computing log p(Y|RG) for 39 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-304...
## Running Pv3Rs for: AR-304 (summed MOI = 5).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 172
## =============================================================================|
## Computing log p(Y|RG) for 172 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-160...
## Running Pv3Rs for: AR-160 (summed MOI = 2).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 3
## =============================================================================|
## Computing log p(Y|RG) for 3 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-110...
## Running Pv3Rs for: AR-110 (summed MOI = 2).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 3
## =============================================================================|
## Computing log p(Y|RG) for 3 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-133...
## Running Pv3Rs for: AR-133 (summed MOI = 5).
## The episode number is: 1 2 3
## Number of valid relationship graphs (RGs) is 202
## =============================================================================|
## Computing log p(Y|RG) for 202 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-140...
## Running Pv3Rs for: AR-140 (summed MOI = 7).
## The episode number is: 1 2 3
## Number of valid relationship graphs (RGs) is 9773
## =============================================================================|
## Computing log p(Y|RG) for 9773 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-142...
## Running Pv3Rs for: AR-142 (summed MOI = 8).
## The episode number is: 1 2 3 4
## Number of valid relationship graphs (RGs) is 91237
## =============================================================================|
## Computing log p(Y|RG) for 91237 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-173...
## Running Pv3Rs for: AR-173 (summed MOI = 3).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 9
## =============================================================================|
## Computing log p(Y|RG) for 9 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-328...
## Running Pv3Rs for: AR-328 (summed MOI = 5).
## The episode number is: 1 2 3
## Number of valid relationship graphs (RGs) is 250
## =============================================================================|
## Computing log p(Y|RG) for 250 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-359...
## NOT Running Pv3Rs for: AR-359 because summed MOI exceeds threshold (observed = 11, MAX_MOI_TO_INCLUDE = 8).
## ID : AR-325...
## Running Pv3Rs for: AR-325 (summed MOI = 2).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 3
## =============================================================================|
## Computing log p(Y|RG) for 3 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-246...
## Running Pv3Rs for: AR-246 (summed MOI = 2).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 3
## =============================================================================|
## Computing log p(Y|RG) for 3 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-250...
## Running Pv3Rs for: AR-250 (summed MOI = 3).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 9
## =============================================================================|
## Computing log p(Y|RG) for 9 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-258...
## Running Pv3Rs for: AR-258 (summed MOI = 5).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 172
## =============================================================================|
## Computing log p(Y|RG) for 172 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-098...
## Running Pv3Rs for: AR-098 (summed MOI = 3).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 9
## =============================================================================|
## Computing log p(Y|RG) for 9 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-176...
## Running Pv3Rs for: AR-176 (summed MOI = 5).
## The episode number is: 1 2 3
## Number of valid relationship graphs (RGs) is 250
## =============================================================================|
## Computing log p(Y|RG) for 250 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-192...
## Running Pv3Rs for: AR-192 (summed MOI = 3).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 9
## =============================================================================|
## Computing log p(Y|RG) for 9 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-214...
## Running Pv3Rs for: AR-214 (summed MOI = 6).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 799
## =============================================================================|
## Computing log p(Y|RG) for 799 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-237...
## Running Pv3Rs for: AR-237 (summed MOI = 3).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 9
## =============================================================================|
## Computing log p(Y|RG) for 9 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-274...
## Running Pv3Rs for: AR-274 (summed MOI = 4).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 39
## =============================================================================|
## Computing log p(Y|RG) for 39 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-300...
## Running Pv3Rs for: AR-300 (summed MOI = 4).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 39
## =============================================================================|
## Computing log p(Y|RG) for 39 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-345...
## Running Pv3Rs for: AR-345 (summed MOI = 4).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 39
## =============================================================================|
## Computing log p(Y|RG) for 39 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-346...
## Running Pv3Rs for: AR-346 (summed MOI = 5).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 172
## =============================================================================|
## Computing log p(Y|RG) for 172 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-348...
## Running Pv3Rs for: AR-348 (summed MOI = 2).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 3
## =============================================================================|
## Computing log p(Y|RG) for 3 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
## ID : AR-382...
## Running Pv3Rs for: AR-382 (summed MOI = 2).
## The episode number is: 1 2
## Number of valid relationship graphs (RGs) is 3
## =============================================================================|
## Computing log p(Y|RG) for 3 RGs
## =============================================================================|
## Finding log-likelihood of each vector of recurrent states
## =============================================================================|
# End timer
t_end <- Sys.time()
cat("Pv3Rs for the whole dataset took : ", as.numeric(difftime(time1 = t_end,
time2 = t_start,
units = "secs"))/60, " mins", "\n", sep = "")
## Pv3Rs for the whole dataset took : 1.561027 mins
# Present the marginal data in a clearer format
indiv_posteriors_marginal <- do.call(rbind,
lapply(indiv_posteriors, function(x) {
if (!is.null(x[["Pv3Rs"]])) {
return(data.frame("subject_id" = x[["subject_id"]],
"episode_number" = x[["episode_number"]][-1],
"Posterior_marginal_prob_C" = x[["Pv3Rs"]]$marg[, "C"],
"Posterior_marginal_prob_L" = x[["Pv3Rs"]]$marg[, "L"],
"Posterior_marginal_prob_I" = x[["Pv3Rs"]]$marg[, "I"]))
} else {
return(NULL)
}
}))
row.names(indiv_posteriors_marginal) <- 1:nrow(indiv_posteriors_marginal)
# Present the joint posterior estimates in a clearer format
indiv_posteriors_joint <- do.call(rbind,
lapply(indiv_posteriors, function(x) {
if (!is.null(x[["Pv3Rs"]])) {
joint_probs <- x[["Pv3Rs"]]$joint
# Extract the state pairs and probabilities
state_pairs <- names(joint_probs)
prob_values <- as.numeric(joint_probs)
# Create a data frame with subject_id, episode_number, and joint probabilities
return(data.frame("subject_id" = rep(x[["subject_id"]], length(state_pairs)),
"episode_number" = rep(x[["episode_number"]][-1], each = length(state_pairs)),
"state_pair" = state_pairs,
"joint_probability" = prob_values))
} else {
return(NULL)
}
}))
row.names(indiv_posteriors_joint) <- 1:nrow(indiv_posteriors_joint)
# Save Pv3Rs output because it's time consuming and
# we don't want to re-run it every time.
save(list = c("RUN_EXAMPLE", "WITHIN_INDIVIDUAL_ALLELE_FREQ_THR", "BENCHMARK_MARKERS", "MAX_MOI_TO_INCLUDE", "PRIOR_3RS",
"analysis_data", "baseline_fs",
"indiv_posteriors", "indiv_posteriors_marginal", "indiv_posteriors_joint"),
file = paste0("./outputs/Pv3Rs_sols_posteriors_",
strftime(Sys.time(), format = "%Y%m%d_%H%M%S"),
".RData"))
The marginal probabilities give us the probability of the three states for each recurrent episode. However, this does not consider the joint probability of different states when a person experienced more than one recurrent episode.
indiv_posteriors_marginal
## subject_id episode_number Posterior_marginal_prob_C
## 1 AR-024 2 0.0000000
## 2 AR-024 3 0.7576302
## 3 AR-024 4 0.8623076
## 4 AR-024 5 0.0000000
## 5 AR-221 2 0.0000000
## 6 AR-343 2 0.0000000
## 7 AR-366 2 0.0000000
## 8 AR-187 2 0.7721759
## 9 AR-187 3 0.7721759
## 10 AR-187 4 0.0000000
## 11 AR-383 2 0.0000000
## 12 AR-288 2 0.0000000
## 13 AR-234 2 0.0000000
## 14 AR-306 2 0.0000000
## 15 AR-306 3 0.7913832
## 16 AR-302 2 0.0000000
## 17 AR-341 2 0.0000000
## 18 AR-326 2 0.0000000
## 19 AR-067 2 0.0000000
## 20 AR-322 2 0.0000000
## 21 AR-355 2 0.0000000
## 22 AR-081 2 0.0000000
## 23 AR-081 3 0.0000000
## 24 AR-248 2 0.0000000
## 25 AR-304 2 0.0000000
## 26 AR-160 2 0.7485703
## 27 AR-110 2 0.7479272
## 28 AR-133 2 0.8615406
## 29 AR-133 3 0.0000000
## 30 AR-140 2 0.0000000
## 31 AR-140 3 0.0000000
## 32 AR-142 2 0.0000000
## 33 AR-142 3 0.0000000
## 34 AR-142 4 0.0000000
## 35 AR-173 2 0.0000000
## 36 AR-328 2 0.0000000
## 37 AR-328 3 0.0000000
## 38 AR-325 2 0.0000000
## 39 AR-246 2 0.0000000
## 40 AR-250 2 0.6878086
## 41 AR-258 2 0.0000000
## 42 AR-098 2 0.0000000
## 43 AR-176 2 0.0000000
## 44 AR-176 3 0.0000000
## 45 AR-192 2 0.6883825
## 46 AR-214 2 0.0000000
## 47 AR-237 2 0.6339993
## 48 AR-274 2 0.0000000
## 49 AR-300 2 0.0000000
## 50 AR-345 2 0.0000000
## 51 AR-346 2 0.8462352
## 52 AR-348 2 0.7479272
## 53 AR-382 2 0.7451339
## Posterior_marginal_prob_L Posterior_marginal_prob_I
## 1 0.20029039 7.997096e-01
## 2 0.24236975 1.780460e-08
## 3 0.13769233 2.870620e-08
## 4 0.04708870 9.529113e-01
## 5 0.99999919 8.095861e-07
## 6 0.09761816 9.023818e-01
## 7 0.49131418 5.086858e-01
## 8 0.22782070 3.361901e-06
## 9 0.22782283 1.232510e-06
## 10 0.15668331 8.433167e-01
## 11 0.26708459 7.329154e-01
## 12 0.09302359 9.069764e-01
## 13 0.95411195 4.588805e-02
## 14 0.10514624 8.948538e-01
## 15 0.20861539 1.457631e-06
## 16 0.62091008 3.790899e-01
## 17 0.05713046 9.428695e-01
## 18 0.89692628 1.030737e-01
## 19 0.99712108 2.878920e-03
## 20 0.20073604 7.992640e-01
## 21 0.18443518 8.155648e-01
## 22 0.34465510 6.553449e-01
## 23 0.99999318 6.820487e-06
## 24 0.99999371 6.292255e-06
## 25 0.05497134 9.450287e-01
## 26 0.25142969 2.054618e-08
## 27 0.25207193 8.834381e-07
## 28 0.13805319 4.061912e-04
## 29 0.99995771 4.228845e-05
## 30 0.31585812 6.841419e-01
## 31 0.99847214 1.527856e-03
## 32 0.09188394 9.081161e-01
## 33 0.87083346 1.291665e-01
## 34 0.99922528 7.747186e-04
## 35 0.85959432 1.404057e-01
## 36 0.22064240 7.793576e-01
## 37 0.31818593 6.818141e-01
## 38 0.99994032 5.967860e-05
## 39 0.26710484 7.328952e-01
## 40 0.31219012 1.300991e-06
## 41 0.06281198 9.371880e-01
## 42 0.99999479 5.206603e-06
## 43 0.25588975 7.441102e-01
## 44 0.99999554 4.459706e-06
## 45 0.31161734 1.483804e-07
## 46 0.71125930 2.887407e-01
## 47 0.36517961 8.210690e-04
## 48 0.09302472 9.069753e-01
## 49 0.09302334 9.069767e-01
## 50 0.09507480 9.049252e-01
## 51 0.15376481 2.089266e-11
## 52 0.25207193 8.834381e-07
## 53 0.25485692 9.180472e-06
The joint probabilites give us values for each possible combination of states, depending on the number of recurrent episodes experienced by the participant.
joint_summary <- indiv_posteriors_joint %>%
group_by(subject_id, state_pair, joint_probability) %>%
filter(episode_number == max(episode_number)) %>%
mutate(percentage = round(joint_probability*100, 3),
total_recurrences = episode_number-1) %>%
select(-episode_number) %>%
arrange(subject_id, total_recurrences, percentage) %>%
relocate(total_recurrences, .before = state_pair)
joint_summary
## # A tibble: 282 × 5
## # Groups: subject_id, state_pair, joint_probability [282]
## subject_id total_recurrences state_pair joint_probability percentage
## <chr> <dbl> <chr> <dbl> <dbl>
## 1 AR-024 4 CCCC 0 0
## 2 AR-024 4 LCCC 0 0
## 3 AR-024 4 ICCC 0 0
## 4 AR-024 4 CLCC 0 0
## 5 AR-024 4 LLCC 0 0
## 6 AR-024 4 ILCC 0 0
## 7 AR-024 4 CICC 0 0
## 8 AR-024 4 LICC 0 0
## 9 AR-024 4 IICC 0 0
## 10 AR-024 4 CCLC 0 0
## # ℹ 272 more rows
marginal_summary <- indiv_posteriors_marginal %>%
pivot_longer(cols = !subject_id & !episode_number,
names_to = "posterior_type",
values_to = "posterior_value") %>%
mutate(posterior_classification = case_when(posterior_type == "Posterior_marginal_prob_C" ~ "Recrudescence",
posterior_type == "Posterior_marginal_prob_L" ~ "Relapse",
posterior_type == "Posterior_marginal_prob_I" ~ "Reinfection"),
posterior_classification = factor(posterior_classification,
levels = c("Relapse", "Recrudescence", "Reinfection"))) %>%
select(-posterior_type)
marginal_summary
## # A tibble: 159 × 4
## subject_id episode_number posterior_value posterior_classification
## <chr> <int> <dbl> <fct>
## 1 AR-024 2 0 Recrudescence
## 2 AR-024 2 0.200 Relapse
## 3 AR-024 2 0.800 Reinfection
## 4 AR-024 3 0.758 Recrudescence
## 5 AR-024 3 0.242 Relapse
## 6 AR-024 3 0.0000000178 Reinfection
## 7 AR-024 4 0.862 Recrudescence
## 8 AR-024 4 0.138 Relapse
## 9 AR-024 4 0.0000000287 Reinfection
## 10 AR-024 5 0 Recrudescence
## # ℹ 149 more rows
marginal_summary %>%
group_by(subject_id) %>%
arrange(desc(posterior_value), .by_group = TRUE) %>%
mutate(subject_id = factor(subject_id, levels = unique(subject_id[order(posterior_value, decreasing = TRUE)]))) %>%
ggplot(aes(x = factor(episode_number), y = posterior_value, group = episode_number, fill = posterior_classification)) +
geom_bar(stat = "identity", position = "fill") +
scale_fill_manual(values = c("Relapse" = "turquoise3",
"Recrudescence" = "skyblue4",
"Reinfection" = "magenta3")) +
scale_x_discrete(expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0)) +
labs(x = "Episode number",
y = "Posterior probability",
fill = "") +
theme_bw() +
facet_wrap(~subject_id)
marginal_summary %>%
left_join(analysis_data %>% select(subject_id, treatment_arm) %>% distinct(),
by = "subject_id") %>%
group_by(subject_id) %>%
arrange(desc(posterior_value), .by_group = TRUE) %>%
mutate(subject_id = factor(subject_id, levels = unique(subject_id[order(posterior_value, decreasing = TRUE)]))) %>%
ggplot(aes(x = factor(episode_number), y = posterior_value, group = episode_number, fill = posterior_classification)) +
geom_bar(stat = "identity", position = "fill") +
scale_fill_manual(values = c("Relapse" = "turquoise3",
"Recrudescence" = "skyblue4",
"Reinfection" = "magenta3")) +
scale_x_discrete(expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0)) +
labs(x = "Episode number",
y = "Posterior probability",
fill = "") +
theme_bw() +
facet_wrap(treatment_arm ~ subject_id)
marginal_summary %>%
left_join(analysis_data %>% select(subject_id, episode_number, treatment_arm, visit_date, days_since_enrolment, days_since_last_episode) %>% distinct(),
by = c("subject_id", "episode_number")) %>%
group_by(subject_id) %>%
arrange(desc(posterior_value), .by_group = TRUE) %>%
mutate(subject_id = factor(subject_id, levels = unique(subject_id[order(posterior_value, decreasing = TRUE)]))) %>%
ggplot(aes(x = days_since_enrolment, y = posterior_value, group = episode_number, fill = posterior_classification)) +
geom_bar(stat = "identity", position = "fill", width = 3) +
scale_fill_manual(values = c("Relapse" = "turquoise3",
"Recrudescence" = "skyblue4",
"Reinfection" = "magenta3")) +
scale_x_continuous(expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0)) +
labs(x = "Days since enrolment",
y = "Posterior probability",
fill = "") +
theme_bw() +
facet_wrap(~subject_id)
## Warning: `position_stack()` requires non-overlapping x intervals.
pal1 <- c(
"C" = "turquoise3",
"I" = "magenta3",
"L" = "skyblue4")
pal2 <- c(
"II" = "magenta3",
"IL" = "darkorange2",
"LI" = "goldenrod2",
"LL" = "skyblue4",
"CI" = "saddlebrown",
"CL" = "turquoise4",
"IC" = "darkolivegreen",
"LC" = "lightseagreen"
)
pal3 <- c(
"IIL" = "darkorange3",
"ILL" = "orangered3",
"LIL" = "goldenrod3",
"LLL" = "skyblue4",
"CCI" = "sienna4",
"CCL" = "cadetblue3",
"CLI" = "darkcyan",
"CLL" = "steelblue3",
"LCI" = "peru",
"LCL" = "powderblue",
"LLI" = "orange3"
)
pal4 <- c(
"ICCI" = "saddlebrown",
"ICCL" = "darkseagreen",
"ICLI" = "orange3",
"ICLL" = "lightsteelblue",
"ILCI" = "darkkhaki",
"ILCL" = "skyblue3",
"ILLI" = "gold3",
"LCCI" = "rosybrown",
"LCCL" = "lightblue3",
"LCLI" = "goldenrod3",
"LCLL" = "deepskyblue3",
"LLCI" = "tan",
"LLCL" = "mediumturquoise",
"LLLI" = "darkorange3"
)
combined_pal <- c(pal1, pal2, pal3, pal4)
plotJointProb <- function(data, recurrence_n, prob_threshold, color_palette, ...){
data %>%
filter(total_recurrences == recurrence_n, joint_probability > prob_threshold) %>%
group_by(subject_id) %>%
mutate(state_pair = fct_reorder(state_pair, joint_probability)) %>%
ungroup() %>%
ggplot(aes(x = joint_probability,
y = subject_id,
fill = state_pair)) +
geom_bar(position = "stack", stat = "identity") +
scale_x_continuous(expand = c(0, 0)) +
scale_y_discrete(expand = c(0, 0)) +
scale_fill_manual(values = color_palette) +
labs(x = "Joint probability estimate",
y = "Participant",
fill = "Classification state") +
theme_bw() +
facet_wrap(~total_recurrences, scales = "free_y")
}
plot_joint1 <- plotJointProb(joint_summary, 1, 0, pal1)
plot_joint2 <- plotJointProb(joint_summary, 2, 0, pal2) + guides(fill = guide_legend(ncol = 2))
plot_joint3 <- plotJointProb(joint_summary, 3, 0.001, pal3) + guides(fill = guide_legend(ncol = 3))
plot_joint4 <- plotJointProb(joint_summary, 4, 0.001, pal4) + guides(fill = guide_legend(ncol = 3))
(plot_joint1 | plot_joint2) / (plot_joint3 | plot_joint4) +
plot_annotation(subtitle = "I = Reinfection, L = Relapse, C = Recrudescence",
caption = expression(italic("For >2 recurrences, only probability estimates > 0.001 are shown"))) +
plot_layout(heights = c(2, 1))
joint_summary %>%
left_join(analysis_data %>% select(subject_id, treatment_arm) %>% distinct(),
by = "subject_id") %>%
filter(joint_probability > 0.001) %>%
group_by(subject_id) %>%
mutate(state_pair = fct_reorder(state_pair, joint_probability)) %>%
ungroup() %>%
ggplot(aes(x = joint_probability,
y = reorder(subject_id, total_recurrences),
fill = state_pair)) +
geom_bar(position = "stack", stat = "identity") +
scale_x_continuous(expand = c(0, 0)) +
scale_y_discrete(expand = c(0, 0)) +
scale_fill_manual(values = combined_pal) +
labs(x = "Joint probability estimate",
y = "Participant",
fill = "Classification state",
subtitle = "I = Reinfection, L = Relapse, C = Recrudescence",
caption = expression(italic("Color coding: if > 1 recurrence, blue shades denote combinations of relapse and recrudescence (L and C), shades of orange/browns denote \ncombos of reinfection and relapse (I and L) and green shades denote combos of reinfection with recrudescence (I and C)"))) +
theme_bw() +
facet_wrap(treatment_arm~total_recurrences, scales = "free") +
# facet_wrap(total_recurrences~treatment_arm, scales = "free") +
theme(plot.caption = element_text(hjust = 0, vjust = -2))
What is the joint probability estimate for the ‘highest ranking’ state pair classification? I.e. if >80% relatively high confidence? n=19
joint_summary %>%
filter(joint_probability > 0.001) %>%
group_by(subject_id) %>%
# get the highest prob for each subj and store ranking
mutate(ranking = rank(-joint_probability, ties.method = "first")) %>%
arrange(subject_id, ranking) %>%
filter(ranking == 1) %>%
ggplot(aes(x = reorder(subject_id, -joint_probability),
y = joint_probability,
fill = factor(total_recurrences))) +
geom_col() +
geom_hline(yintercept = 0.8, color = "darkgray", linetype = "dashed") +
labs(x = "Participant",
y = "Joint probability estimate for highest 'ranking' state pair",
fill = "Number of recurrent episodes",
caption = expression(italic("Dashed line indicates an arbitrary threshold of 80%"))) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
How many state pairs make up the arbitrary 80% threshold?
joint_summary %>%
filter(joint_probability > 0.001) %>%
group_by(subject_id) %>%
# Get the highest prob for each subject and store ranking
mutate(ranking = rank(-joint_probability, ties.method = "first")) %>%
# reverse ranking order of ranking so we plot the highest rank at the bottom
arrange(subject_id, desc(ranking)) %>%
mutate(ranking = factor(ranking, levels = unique(ranking))) %>%
ggplot(aes(x = subject_id,
y = joint_probability,
fill = ranking)) +
geom_bar(stat = "identity", position = "stack") +
geom_hline(yintercept = 0.8, color = "darkgray", linetype = "dashed") +
scale_fill_manual(values = colorRampPalette(brewer.pal(12, "Paired"))(14),
guide = guide_legend(reverse = T)) +
scale_x_discrete(expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0)) +
labs(x = "Participant",
y = "Joint probability estimate for state pair",
fill = "Number of possible \nestimated state pairs",
caption = expression(italic("Dashed line indicates an arbitrary threshold of 80%"))) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
facet_wrap(~total_recurrences, scales = "free_x")
joint_top3_summary <- joint_summary %>%
select(-percentage) %>%
group_by(subject_id) %>%
# Get the highest prob for each subject and store ranking
mutate(ranking = rank(-joint_probability, ties.method = "first")) %>%
arrange(subject_id, ranking) %>%
filter(ranking %in% c(1, 2, 3))
We will keep the ‘highest ranking’ joint probability estimate so we can compare the state classification for each episode with other metrics
joint_recurrence_summary <- joint_top3_summary %>%
select(-total_recurrences) %>%
filter(ranking == 1) %>%
mutate(state_pair_split = strsplit(state_pair, "")) %>% # Split state_pair into individual letters
unnest(state_pair_split) %>% # Unnest to create a new row for each letter
group_by(subject_id) %>%
mutate(
episode_number = row_number() + 1, # Create episode number starting from n+1 to reflect number of recurrence
joint_probability = joint_probability, # Keep the same joint_probability for all episodes\
state_classification = case_when(state_pair_split == "L" ~ "Relapse",
state_pair_split == "C" ~ "Recrudescence",
state_pair_split == "I" ~ "Reinfection")) %>%
select(subject_id, episode_number, state_classification, joint_probability)
joint_recurrence_summary %>%
ggplot(aes(x = factor(episode_number), y = joint_probability, fill = state_classification)) +
geom_col() +
scale_fill_manual(values = c("Relapse" = "turquoise3",
"Recrudescence" = "skyblue4",
"Reinfection" = "magenta3")) +
scale_x_discrete(expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0)) +
labs(x = "Episode number",
y = "Posterior joint probability",
fill = "") +
theme_bw() +
facet_wrap(~subject_id)
posterior_summary <- marginal_summary %>%
rename("marginal_posterior_probability" = "posterior_value",
"marginal_classification" = "posterior_classification") %>%
left_join(joint_recurrence_summary %>%
rename("joint_classification" = "state_classification",
"joint_posterior_probability" = "joint_probability"),
by = c("subject_id", "episode_number")) %>%
pivot_longer(cols = c(marginal_posterior_probability,
joint_posterior_probability,
marginal_classification,
joint_classification),
names_to = c("estimate_type", ".value"),
names_pattern = "(marginal|joint)_(posterior_probability|classification)") %>%
distinct()
posterior_summary %>%
group_by(subject_id, episode_number) %>%
arrange(desc(posterior_probability), .by_group = TRUE) %>%
mutate(classification = factor(classification, levels = unique(classification))) %>%
ungroup() %>%
ggplot(aes(x = factor(episode_number), y = posterior_probability, group = episode_number)) +
geom_bar(data = . %>% filter(estimate_type == "marginal"),
aes(fill = classification),
stat = "identity",
position = "fill") +
geom_segment(data = . %>% filter(estimate_type == "joint"),
aes(x = as.numeric(factor(episode_number)) - 0.4,
xend = as.numeric(factor(episode_number)) + 0.4,
y = posterior_probability,
yend = posterior_probability,
color = classification),
size = 1.5) +
geom_point(data = . %>% filter(estimate_type == "joint"),
aes(color = classification,
fill = classification),
shape = 21) +
scale_fill_manual(values = c("Relapse" = "turquoise3",
"Recrudescence" = "skyblue4",
"Reinfection" = "magenta3")) +
scale_color_manual(values = c("Relapse" = "#006064",
"Recrudescence" = "#3B4F75",
"Reinfection" = "#8B008B")) +
scale_x_discrete(expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0)) +
labs(x = "Episode number",
y = "Posterior probability",
fill = "Marginal classification",
color = "Joint probability estimate \n and classification") +
theme_bw() +
facet_wrap(~subject_id)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
sols_pnh <- read.csv(here("data/final", "sols_PNH_data.csv"))
ibd_r_data <- readRDS(here("data/final", "sols_all_meta.rds"))
ibd_rhat_data <- readRDS(here("data/final", "sols_sig_meta.rds"))
For direct comparison with the we can only look at the paired comparisons between baseline samples and follow-up samples, but not other pw comparisons for a given participant.
sols_ibd <- ibd_r_data %>%
# filter to include only paired samples, and only pw comparisons to baseline
filter(comparison_type == "paired", day_id2 == 0) %>%
mutate(subject_id = paste0("AR-", patientid1),
day_id1 = as.integer(day_id1),
day_id2 = as.integer(day_id2)) %>%
group_by(subject_id) %>%
arrange(day_id1) %>%
# create episode_number accounting for baseline as episode 1
mutate(episode_number = row_number() + 1) %>%
ungroup()
classification_summary <- indiv_posteriors_marginal %>%
rename("posterior_probability_recrudescence" = "Posterior_marginal_prob_C",
"posterior_probability_relapse" = "Posterior_marginal_prob_L",
"posterior_probability_reinfection" = "Posterior_marginal_prob_I") %>%
# merge with epi variables
left_join(analysis_data %>% select(subject_id, sample_id, episode_number, treatment_arm, visit_date, days_since_enrolment, days_since_last_episode) %>% distinct(),
by = c("subject_id", "episode_number")) %>%
# merge with PNH data
left_join(sols_pnh, by = c("subject_id" = "Patient", "episode_number" = "recurrence")) %>%
rename("prop_same_haps_1minusPNH" = "Prop_het",
"pnh_range" = "identity",
"classification" = "Classification") %>%
# create bins for 1-PNH (IBS metric)
mutate(ibs_range = cut(prop_same_haps_1minusPNH,
breaks = c(0, 0.25, 0.5, 0.75, 1),
include.lowest = TRUE,
labels = c("0-0.25", "0.25-0.5", "0.5-0.75", "0.75-1"))) %>%
# merge with IBD r
left_join(sols_ibd,
by = c("subject_id", "episode_number")) %>%
# create bins for IBD r
mutate(ibd_range = cut(estimate,
breaks = c(0, 0.25, 0.5, 0.75, 1),
include.lowest = TRUE,
labels = c("0-0.25", "0.25-0.5", "0.5-0.75", "0.75-1"))) %>%
# merge with joint probabilities
left_join(joint_recurrence_summary %>%
rename("joint_classification" = "state_classification"),
by = c("subject_id", "episode_number"))
classification_summary
## subject_id episode_number posterior_probability_recrudescence
## 1 AR-024 2 0.0000000
## 2 AR-024 3 0.7576302
## 3 AR-024 4 0.8623076
## 4 AR-024 5 0.0000000
## 5 AR-221 2 0.0000000
## 6 AR-343 2 0.0000000
## 7 AR-366 2 0.0000000
## 8 AR-187 2 0.7721759
## 9 AR-187 3 0.7721759
## 10 AR-187 4 0.0000000
## 11 AR-383 2 0.0000000
## 12 AR-288 2 0.0000000
## 13 AR-234 2 0.0000000
## 14 AR-306 2 0.0000000
## 15 AR-306 3 0.7913832
## 16 AR-302 2 0.0000000
## 17 AR-341 2 0.0000000
## 18 AR-326 2 0.0000000
## 19 AR-067 2 0.0000000
## 20 AR-322 2 0.0000000
## 21 AR-355 2 0.0000000
## 22 AR-081 2 0.0000000
## 23 AR-081 3 0.0000000
## 24 AR-248 2 0.0000000
## 25 AR-304 2 0.0000000
## 26 AR-160 2 0.7485703
## 27 AR-110 2 0.7479272
## 28 AR-133 2 0.8615406
## 29 AR-133 3 0.0000000
## 30 AR-140 2 0.0000000
## 31 AR-140 3 0.0000000
## 32 AR-142 2 0.0000000
## 33 AR-142 3 0.0000000
## 34 AR-142 4 0.0000000
## 35 AR-173 2 0.0000000
## 36 AR-328 2 0.0000000
## 37 AR-328 3 0.0000000
## 38 AR-325 2 0.0000000
## 39 AR-246 2 0.0000000
## 40 AR-250 2 0.6878086
## 41 AR-258 2 0.0000000
## 42 AR-098 2 0.0000000
## 43 AR-176 2 0.0000000
## 44 AR-176 3 0.0000000
## 45 AR-192 2 0.6883825
## 46 AR-214 2 0.0000000
## 47 AR-237 2 0.6339993
## 48 AR-274 2 0.0000000
## 49 AR-300 2 0.0000000
## 50 AR-345 2 0.0000000
## 51 AR-346 2 0.8462352
## 52 AR-348 2 0.7479272
## 53 AR-382 2 0.7451339
## posterior_probability_relapse posterior_probability_reinfection sample_id
## 1 0.20029039 7.997096e-01 AR-024-56
## 2 0.24236975 1.780460e-08 AR-024-70
## 3 0.13769233 2.870620e-08 AR-024-84
## 4 0.04708870 9.529113e-01 AR-024-112
## 5 0.99999919 8.095861e-07 AR-221-70
## 6 0.09761816 9.023818e-01 AR-343-112
## 7 0.49131418 5.086858e-01 AR-366-168
## 8 0.22782070 3.361901e-06 AR-187-42
## 9 0.22782283 1.232510e-06 AR-187-84
## 10 0.15668331 8.433167e-01 AR-187-168
## 11 0.26708459 7.329154e-01 AR-383-168
## 12 0.09302359 9.069764e-01 AR-288-70
## 13 0.95411195 4.588805e-02 AR-234-112
## 14 0.10514624 8.948538e-01 AR-306-84
## 15 0.20861539 1.457631e-06 AR-306-85
## 16 0.62091008 3.790899e-01 AR-302-70
## 17 0.05713046 9.428695e-01 AR-341-84
## 18 0.89692628 1.030737e-01 AR-326-168
## 19 0.99712108 2.878920e-03 AR-067-42
## 20 0.20073604 7.992640e-01 AR-322-140
## 21 0.18443518 8.155648e-01 AR-355-140
## 22 0.34465510 6.553449e-01 AR-081-140
## 23 0.99999318 6.820487e-06 AR-081-168
## 24 0.99999371 6.292255e-06 AR-248-84
## 25 0.05497134 9.450287e-01 AR-304-28
## 26 0.25142969 2.054618e-08 AR-160-84
## 27 0.25207193 8.834381e-07 AR-110-84
## 28 0.13805319 4.061912e-04 AR-133-84
## 29 0.99995771 4.228845e-05 AR-133-140
## 30 0.31585812 6.841419e-01 AR-140-56
## 31 0.99847214 1.527856e-03 AR-140-70
## 32 0.09188394 9.081161e-01 AR-142-42
## 33 0.87083346 1.291665e-01 AR-142-46
## 34 0.99922528 7.747186e-04 AR-142-97
## 35 0.85959432 1.404057e-01 AR-173-28
## 36 0.22064240 7.793576e-01 AR-328-84
## 37 0.31818593 6.818141e-01 AR-328-112
## 38 0.99994032 5.967860e-05 AR-325-140
## 39 0.26710484 7.328952e-01 AR-246-70
## 40 0.31219012 1.300991e-06 AR-250-56
## 41 0.06281198 9.371880e-01 AR-258-140
## 42 0.99999479 5.206603e-06 AR-098-56
## 43 0.25588975 7.441102e-01 AR-176-28
## 44 0.99999554 4.459706e-06 AR-176-33
## 45 0.31161734 1.483804e-07 AR-192-70
## 46 0.71125930 2.887407e-01 AR-214-64
## 47 0.36517961 8.210690e-04 AR-237-70
## 48 0.09302472 9.069753e-01 AR-274-56
## 49 0.09302334 9.069767e-01 AR-300-84
## 50 0.09507480 9.049252e-01 AR-345-112
## 51 0.15376481 2.089266e-11 AR-346-112
## 52 0.25207193 8.834381e-07 AR-348-84
## 53 0.25485692 9.180472e-06 AR-382-168
## treatment_arm visit_date days_since_enrolment days_since_last_episode
## 1 AL-PQ 2017-12-11 56 56
## 2 AL-PQ 2017-12-25 70 14
## 3 AL-PQ 2018-01-09 85 15
## 4 AL-PQ 2018-02-05 112 27
## 5 AL-PQ 2018-11-15 70 70
## 6 DHP-PQ 2019-05-08 112 112
## 7 DHP-PQ 2019-07-24 168 168
## 8 AL 2018-09-27 43 43
## 9 AL 2018-11-07 84 41
## 10 AL 2019-01-30 168 84
## 11 DHP-PQ 2019-08-07 168 168
## 12 DHP-PQ 2019-01-08 70 70
## 13 DHP-PQ 2019-01-04 112 112
## 14 AL-PQ 2019-02-18 84 84
## 15 AL-PQ 2019-02-19 85 1
## 16 DHP-PQ 2019-01-23 70 70
## 17 DHP-PQ 2019-04-08 84 84
## 18 DHP-PQ 2019-06-26 168 168
## 19 AL 2018-01-22 42 42
## 20 DHP-PQ 2019-05-28 140 140
## 21 AL-PQ 2019-06-13 140 140
## 22 AL-PQ 2018-07-16 140 140
## 23 AL-PQ 2018-08-13 168 28
## 24 AL-PQ 2018-12-22 82 82
## 25 AL 2018-12-17 28 28
## 26 AL 2018-10-16 84 84
## 27 AL-PQ 2018-08-07 84 84
## 28 AL 2018-09-10 84 84
## 29 AL 2018-11-05 140 56
## 30 AL-PQ 2018-08-20 56 56
## 31 AL-PQ 2018-09-03 70 14
## 32 AL 2018-08-09 42 42
## 33 AL 2018-08-13 46 4
## 34 AL 2018-10-03 97 51
## 35 AL 2018-09-03 28 28
## 36 DHP-PQ 2019-04-01 82 82
## 37 DHP-PQ 2019-05-02 113 31
## 38 AL 2019-05-29 140 140
## 39 DHP-PQ 2018-12-05 70 70
## 40 AL 2018-11-26 56 56
## 41 DHP-PQ 2019-02-25 140 140
## 42 AL-PQ 2018-06-14 56 56
## 43 AL 2018-09-05 28 28
## 44 AL 2018-09-10 33 5
## 45 AL-PQ 2018-10-29 70 70
## 46 AL-PQ 2018-11-07 64 64
## 47 AL-PQ 2018-11-26 70 70
## 48 AL-PQ 2018-12-12 56 56
## 49 AL-PQ 2019-02-04 84 84
## 50 AL-PQ 2019-05-08 112 112
## 51 DHP-PQ 2019-05-08 112 112
## 52 AL 2019-04-11 84 84
## 53 AL-PQ 2019-08-06 168 168
## sample date treatment treatment2 prop_same_haps_1minusPNH
## 1 AR-024-56 2017-12-11 AL-PQ PQ 0.3068233
## 2 AR-024-70 2017-12-25 AL-PQ PQ 0.2219938
## 3 AR-024-84 2018-01-09 AL-PQ PQ 0.2414539
## 4 AR-024-112 2018-02-05 AL-PQ PQ 0.1420881
## 5 AR-221-70 2018-11-15 AL-PQ PQ 1.0000000
## 6 AR-343-112 2019-05-08 DHP-PQ PQ 0.4906786
## 7 AR-366-168 2019-07-24 DHP-PQ PQ 0.4117348
## 8 AR-187-42 2018-09-27 AL AL 1.0000000
## 9 AR-187-84 2018-11-07 AL AL 1.0000000
## 10 AR-187-168 2019-01-30 AL AL 0.3852387
## 11 AR-383-168 2019-08-07 DHP-PQ PQ 0.2490546
## 12 AR-288-70 2019-01-08 DHP-PQ PQ 0.3338144
## 13 AR-234-112 2019-01-04 DHP-PQ PQ 0.8279783
## 14 AR-306-84 2019-02-18 AL-PQ PQ 0.7195963
## 15 AR-306-85 2019-02-19 AL-PQ PQ 0.7195963
## 16 AR-302-70 2019-01-23 DHP-PQ PQ 0.8824741
## 17 AR-341-84 2019-04-08 DHP-PQ PQ 0.4394276
## 18 AR-326-168 2019-06-26 DHP-PQ PQ 0.8963978
## 19 AR-067-42 2018-01-22 AL AL 1.0000000
## 20 AR-322-140 2019-05-28 DHP-PQ PQ 0.3955032
## 21 AR-355-140 2019-06-13 AL-PQ PQ 0.3345985
## 22 AR-081-140 2018-07-16 AL-PQ PQ 0.6640732
## 23 AR-081-168 2018-08-13 AL-PQ PQ 0.6640732
## 24 AR-248-84 2018-12-22 AL-PQ PQ 1.0000000
## 25 AR-304-28 2018-12-17 AL AL 0.5245581
## 26 AR-160-84 2018-10-16 AL AL 1.0000000
## 27 AR-110-84 2018-08-07 AL-PQ PQ 1.0000000
## 28 AR-133-84 2018-09-10 AL AL 1.0000000
## 29 AR-133-140 2018-11-05 AL AL 1.0000000
## 30 AR-140-56 2018-08-20 AL-PQ PQ 0.8158586
## 31 AR-140-70 2018-09-03 AL-PQ PQ 1.0000000
## 32 AR-142-42 2018-08-09 AL AL 0.3976700
## 33 AR-142-46 2018-08-13 AL AL 0.4526802
## 34 AR-142-97 2018-10-03 AL AL 0.7944225
## 35 AR-173-28 2018-09-03 AL AL 0.8550821
## 36 AR-328-84 2019-04-01 DHP-PQ PQ 0.4645095
## 37 AR-328-112 2019-05-02 DHP-PQ PQ 0.6741595
## 38 AR-325-140 2019-05-29 AL AL 0.8956478
## 39 AR-246-70 2018-12-05 DHP-PQ PQ 0.2168451
## 40 AR-250-56 2018-11-26 AL AL 1.0000000
## 41 AR-258-140 2019-02-25 DHP-PQ PQ 0.8162594
## 42 AR-098-56 2018-06-14 AL-PQ PQ 1.0000000
## 43 AR-176-28 2018-09-05 AL AL 0.4092557
## 44 AR-176-33 2018-09-10 AL AL 0.3871925
## 45 AR-192-70 2018-10-29 AL-PQ PQ 1.0000000
## 46 AR-214-64 2018-11-07 AL-PQ PQ 0.9085737
## 47 AR-237-70 2018-11-26 AL-PQ PQ 1.0000000
## 48 AR-274-56 2018-12-12 AL-PQ PQ 0.1967818
## 49 AR-300-84 2019-02-04 AL-PQ PQ 0.2981501
## 50 AR-345-112 2019-05-08 AL-PQ PQ 0.3738918
## 51 AR-346-112 2019-05-08 DHP-PQ PQ 1.0000000
## 52 AR-348-84 2019-04-11 AL AL 1.0000000
## 53 AR-382-168 2019-08-06 AL-PQ PQ 1.0000000
## delay_since_prev_ep change_moi max_moi clones pnh_range classification
## 1 56 1 2 poly (0,0.75] Homologous
## 2 70 -1 1 mono (0.75,1] Heterologous
## 3 85 0 1 mono (0.75,1] Heterologous
## 4 112 1 2 poly (0.75,1] Heterologous
## 5 70 1 2 poly (-Inf,0] Homologous
## 6 112 1 3 poly (0,0.75] Homologous
## 7 168 -2 1 mono (0,0.75] Homologous
## 8 43 0 1 mono (-Inf,0] Homologous
## 9 84 0 1 mono (-Inf,0] Homologous
## 10 168 1 2 poly (0,0.75] Homologous
## 11 168 0 1 mono (0.75,1] Heterologous
## 12 70 0 2 poly (0,0.75] Homologous
## 13 112 -1 2 poly (0,0.75] Homologous
## 14 84 0 2 poly (0,0.75] Homologous
## 15 85 -1 1 mono (0,0.75] Homologous
## 16 70 0 2 poly (0,0.75] Homologous
## 17 84 1 3 poly (0,0.75] Homologous
## 18 168 -1 2 poly (0,0.75] Homologous
## 19 42 0 2 poly (-Inf,0] Homologous
## 20 140 1 2 poly (0,0.75] Homologous
## 21 140 1 2 poly (0,0.75] Homologous
## 22 140 0 2 poly (0,0.75] Homologous
## 23 168 1 3 poly (0,0.75] Homologous
## 24 82 0 2 poly (-Inf,0] Homologous
## 25 28 -1 2 poly (0,0.75] Homologous
## 26 84 0 1 mono (-Inf,0] Homologous
## 27 84 0 1 mono (-Inf,0] Homologous
## 28 84 0 1 mono (-Inf,0] Homologous
## 29 140 2 3 poly (-Inf,0] Homologous
## 30 56 1 3 poly (0,0.75] Homologous
## 31 70 -1 2 poly (-Inf,0] Homologous
## 32 42 1 3 poly (0,0.75] Homologous
## 33 46 -2 1 mono (0,0.75] Homologous
## 34 97 1 2 poly (0,0.75] Homologous
## 35 28 -1 1 mono (0,0.75] Homologous
## 36 82 -1 1 mono (0,0.75] Homologous
## 37 113 1 2 poly (0,0.75] Homologous
## 38 140 0 1 mono (0,0.75] Homologous
## 39 70 0 1 mono (0.75,1] Heterologous
## 40 56 -1 1 mono (-Inf,0] Homologous
## 41 140 -1 2 poly (0,0.75] Homologous
## 42 56 1 2 poly (-Inf,0] Homologous
## 43 28 1 2 poly (0,0.75] Homologous
## 44 33 0 2 poly (0,0.75] Homologous
## 45 70 -1 1 mono (-Inf,0] Homologous
## 46 64 -2 2 poly (0,0.75] Homologous
## 47 70 -1 1 mono (-Inf,0] Homologous
## 48 56 0 2 poly (0.75,1] Heterologous
## 49 84 0 2 poly (0,0.75] Homologous
## 50 112 0 2 poly (0,0.75] Homologous
## 51 112 -1 2 poly (-Inf,0] Homologous
## 52 84 0 1 mono (-Inf,0] Homologous
## 53 168 0 1 mono (-Inf,0] Homologous
## .group ibs_range sampleid1 sampleid2 estimate p_value CI_lower
## 1 1 0.25-0.5 AR-024-56 AR-024-0 0.000 5.000000e-01 0.000
## 2 1 0-0.25 AR-024-70 AR-024-0 0.000 5.000000e-01 0.000
## 3 1 0-0.25 AR-024-84 AR-024-0 0.000 5.000000e-01 0.000
## 4 1 0-0.25 AR-024-112 AR-024-0 0.000 5.000000e-01 0.000
## 5 15 0.75-1 AR-221-70 AR-221-0 1.000 6.314730e-08 0.732
## 6 33 0.25-0.5 AR-343-112 AR-343-0 0.000 5.000000e-01 0.000
## 7 39 0.25-0.5 AR-366-168 AR-366-0 0.282 1.139043e-02 0.022
## 8 12 0.75-1 AR-187-42 AR-187-0 1.000 1.236722e-06 0.712
## 9 12 0.75-1 AR-187-84 AR-187-0 1.000 7.824760e-08 0.763
## 10 12 0.25-0.5 AR-187-168 AR-187-0 0.175 2.099312e-01 0.000
## 11 41 0-0.25 AR-383-168 AR-383-0 0.000 5.000000e-01 0.000
## 12 23 0.25-0.5 AR-288-70 AR-288-0 0.000 5.000000e-01 0.000
## 13 16 0.75-1 AR-234-112 AR-234-0 0.669 1.240223e-03 0.212
## 14 27 0.5-0.75 AR-306-84 AR-306-0 0.548 6.480622e-03 0.103
## 15 27 0.5-0.75 AR-306-85 AR-306-0 0.567 3.426794e-03 0.138
## 16 25 0.75-1 AR-302-70 AR-302-0 0.560 3.842267e-02 0.000
## 17 32 0.25-0.5 AR-341-84 AR-341-0 0.000 5.000000e-01 0.000
## 18 30 0.75-1 AR-326-168 AR-326-0 0.725 2.205554e-02 0.022
## 19 2 0.75-1 AR-067-42 AR-067-0 1.000 3.856190e-05 0.631
## 20 28 0.25-0.5 AR-322-140 AR-322-0 0.000 5.000000e-01 0.000
## 21 37 0.25-0.5 AR-355-140 AR-355-0 0.031 4.380358e-01 0.000
## 22 3 0.5-0.75 AR-081-140 AR-081-0 0.000 5.000000e-01 0.000
## 23 3 0.5-0.75 AR-081-168 AR-081-0 0.000 5.000000e-01 0.000
## 24 19 0.75-1 AR-248-84 AR-248-0 1.000 5.548439e-06 0.697
## 25 26 0.5-0.75 AR-304-28 AR-304-0 0.000 5.000000e-01 0.000
## 26 9 0.75-1 AR-160-84 AR-160-0 1.000 8.884332e-10 0.775
## 27 5 0.75-1 AR-110-84 AR-110-0 1.000 2.540399e-08 0.768
## 28 6 0.75-1 AR-133-84 AR-133-0 1.000 3.057878e-06 0.662
## 29 6 0.75-1 AR-133-140 AR-133-0 1.000 4.752026e-07 0.718
## 30 7 0.75-1 AR-140-56 AR-140-0 0.683 3.407867e-02 0.000
## 31 7 0.75-1 AR-140-70 AR-140-0 1.000 1.578781e-04 0.585
## 32 8 0.25-0.5 AR-142-42 AR-142-0 0.000 5.000000e-01 0.000
## 33 8 0.25-0.5 AR-142-46 AR-142-0 0.000 5.000000e-01 0.000
## 34 8 0.75-1 AR-142-97 AR-142-0 0.672 9.258504e-04 0.222
## 35 10 0.75-1 AR-173-28 AR-173-0 0.755 3.153185e-03 0.200
## 36 31 0.25-0.5 AR-328-84 AR-328-0 0.111 2.890700e-01 0.000
## 37 31 0.5-0.75 AR-328-112 AR-328-0 0.374 4.673184e-02 0.000
## 38 29 0.75-1 AR-325-140 AR-325-0 0.863 1.458441e-07 0.501
## 39 18 0-0.25 AR-246-70 AR-246-0 0.000 5.000000e-01 0.000
## 40 20 0.75-1 AR-250-56 AR-250-0 1.000 5.050400e-08 0.764
## 41 21 0.75-1 AR-258-140 AR-258-0 0.659 4.000648e-04 0.218
## 42 4 0.75-1 AR-098-56 AR-098-0 1.000 5.196368e-08 0.755
## 43 11 0.25-0.5 AR-176-28 AR-176-0 0.000 5.000000e-01 0.000
## 44 11 0.25-0.5 AR-176-33 AR-176-0 0.000 5.000000e-01 0.000
## 45 13 0.75-1 AR-192-70 AR-192-0 1.000 8.370809e-09 0.764
## 46 14 0.75-1 AR-214-64 AR-214-0 0.672 7.405563e-03 0.089
## 47 17 0.75-1 AR-237-70 AR-237-0 1.000 3.886609e-04 0.536
## 48 22 0-0.25 AR-274-56 AR-274-0 0.000 5.000000e-01 0.000
## 49 24 0.25-0.5 AR-300-84 AR-300-0 0.090 3.160523e-01 0.000
## 50 34 0.25-0.5 AR-345-112 AR-345-0 0.000 5.000000e-01 0.000
## 51 35 0.75-1 AR-346-112 AR-346-0 1.000 1.784710e-06 0.713
## 52 36 0.75-1 AR-348-84 AR-348-0 1.000 2.540399e-08 0.768
## 53 40 0.75-1 AR-382-168 AR-382-0 1.000 7.855351e-07 0.705
## CI_upper patientid1 patientid2 day_id1 day_id2 comparison_type
## 1 0.341 024 024 56 0 paired
## 2 0.300 024 024 70 0 paired
## 3 0.346 024 024 84 0 paired
## 4 0.251 024 024 112 0 paired
## 5 1.000 221 221 70 0 paired
## 6 0.442 343 343 112 0 paired
## 7 0.635 366 366 168 0 paired
## 8 1.000 187 187 42 0 paired
## 9 1.000 187 187 84 0 paired
## 10 0.656 187 187 168 0 paired
## 11 0.447 383 383 168 0 paired
## 12 0.388 288 288 70 0 paired
## 13 0.938 234 234 112 0 paired
## 14 0.872 306 306 84 0 paired
## 15 0.877 306 306 85 0 paired
## 16 0.963 302 302 70 0 paired
## 17 0.389 341 341 84 0 paired
## 18 0.983 326 326 168 0 paired
## 19 1.000 067 067 42 0 paired
## 20 0.404 322 322 140 0 paired
## 21 0.482 355 355 140 0 paired
## 22 0.570 081 081 140 0 paired
## 23 0.498 081 081 168 0 paired
## 24 1.000 248 248 84 0 paired
## 25 0.474 304 304 28 0 paired
## 26 1.000 160 160 84 0 paired
## 27 1.000 110 110 84 0 paired
## 28 1.000 133 133 84 0 paired
## 29 1.000 133 133 140 0 paired
## 30 0.980 140 140 56 0 paired
## 31 1.000 140 140 70 0 paired
## 32 0.494 142 142 42 0 paired
## 33 0.542 142 142 46 0 paired
## 34 0.939 142 142 97 0 paired
## 35 0.984 173 173 28 0 paired
## 36 0.575 328 328 84 0 paired
## 37 0.807 328 328 112 0 paired
## 38 0.991 325 325 140 0 paired
## 39 0.456 246 246 70 0 paired
## 40 1.000 250 250 56 0 paired
## 41 0.935 258 258 140 0 paired
## 42 1.000 098 098 56 0 paired
## 43 0.489 176 176 28 0 paired
## 44 0.411 176 176 33 0 paired
## 45 1.000 192 192 70 0 paired
## 46 0.978 214 214 64 0 paired
## 47 1.000 237 237 70 0 paired
## 48 0.300 274 274 56 0 paired
## 49 0.547 300 300 84 0 paired
## 50 0.480 345 345 112 0 paired
## 51 1.000 346 346 112 0 paired
## 52 1.000 348 348 84 0 paired
## 53 1.000 382 382 168 0 paired
## pair sig_est ibd_range joint_classification
## 1 AR-024-56 / AR-024-0 not significant 0-0.25 Reinfection
## 2 AR-024-70 / AR-024-0 not significant 0-0.25 Recrudescence
## 3 AR-024-84 / AR-024-0 not significant 0-0.25 Recrudescence
## 4 AR-024-112 / AR-024-0 not significant 0-0.25 Reinfection
## 5 AR-221-70 / AR-221-0 significant 0.75-1 Relapse
## 6 AR-343-112 / AR-343-0 not significant 0-0.25 Reinfection
## 7 AR-366-168 / AR-366-0 significant 0.25-0.5 Reinfection
## 8 AR-187-42 / AR-187-0 significant 0.75-1 Recrudescence
## 9 AR-187-84 / AR-187-0 significant 0.75-1 Recrudescence
## 10 AR-187-168 / AR-187-0 not significant 0-0.25 Reinfection
## 11 AR-383-168 / AR-383-0 not significant 0-0.25 Reinfection
## 12 AR-288-70 / AR-288-0 not significant 0-0.25 Reinfection
## 13 AR-234-112 / AR-234-0 significant 0.5-0.75 Relapse
## 14 AR-306-84 / AR-306-0 significant 0.5-0.75 Reinfection
## 15 AR-306-85 / AR-306-0 significant 0.5-0.75 Recrudescence
## 16 AR-302-70 / AR-302-0 significant 0.5-0.75 Relapse
## 17 AR-341-84 / AR-341-0 not significant 0-0.25 Reinfection
## 18 AR-326-168 / AR-326-0 significant 0.5-0.75 Relapse
## 19 AR-067-42 / AR-067-0 significant 0.75-1 Relapse
## 20 AR-322-140 / AR-322-0 not significant 0-0.25 Reinfection
## 21 AR-355-140 / AR-355-0 not significant 0-0.25 Reinfection
## 22 AR-081-140 / AR-081-0 not significant 0-0.25 Reinfection
## 23 AR-081-168 / AR-081-0 not significant 0-0.25 Relapse
## 24 AR-248-84 / AR-248-0 significant 0.75-1 Relapse
## 25 AR-304-28 / AR-304-0 not significant 0-0.25 Reinfection
## 26 AR-160-84 / AR-160-0 significant 0.75-1 Recrudescence
## 27 AR-110-84 / AR-110-0 significant 0.75-1 Recrudescence
## 28 AR-133-84 / AR-133-0 significant 0.75-1 Recrudescence
## 29 AR-133-140 / AR-133-0 significant 0.75-1 Relapse
## 30 AR-140-56 / AR-140-0 significant 0.5-0.75 Reinfection
## 31 AR-140-70 / AR-140-0 significant 0.75-1 Relapse
## 32 AR-142-42 / AR-142-0 not significant 0-0.25 Reinfection
## 33 AR-142-46 / AR-142-0 not significant 0-0.25 Relapse
## 34 AR-142-97 / AR-142-0 significant 0.5-0.75 Relapse
## 35 AR-173-28 / AR-173-0 significant 0.75-1 Relapse
## 36 AR-328-84 / AR-328-0 not significant 0-0.25 Reinfection
## 37 AR-328-112 / AR-328-0 significant 0.25-0.5 Reinfection
## 38 AR-325-140 / AR-325-0 significant 0.75-1 Relapse
## 39 AR-246-70 / AR-246-0 not significant 0-0.25 Reinfection
## 40 AR-250-56 / AR-250-0 significant 0.75-1 Recrudescence
## 41 AR-258-140 / AR-258-0 significant 0.5-0.75 Reinfection
## 42 AR-098-56 / AR-098-0 significant 0.75-1 Relapse
## 43 AR-176-28 / AR-176-0 not significant 0-0.25 Reinfection
## 44 AR-176-33 / AR-176-0 not significant 0-0.25 Relapse
## 45 AR-192-70 / AR-192-0 significant 0.75-1 Recrudescence
## 46 AR-214-64 / AR-214-0 significant 0.5-0.75 Relapse
## 47 AR-237-70 / AR-237-0 significant 0.75-1 Recrudescence
## 48 AR-274-56 / AR-274-0 not significant 0-0.25 Reinfection
## 49 AR-300-84 / AR-300-0 not significant 0-0.25 Reinfection
## 50 AR-345-112 / AR-345-0 not significant 0-0.25 Reinfection
## 51 AR-346-112 / AR-346-0 significant 0.75-1 Recrudescence
## 52 AR-348-84 / AR-348-0 significant 0.75-1 Recrudescence
## 53 AR-382-168 / AR-382-0 significant 0.75-1 Recrudescence
## joint_probability
## 1 0.5035808
## 2 0.5035808
## 3 0.5035808
## 4 0.5035808
## 5 0.9999992
## 6 0.9023818
## 7 0.5086858
## 8 0.4787060
## 9 0.4787060
## 10 0.4787060
## 11 0.7329154
## 12 0.9069764
## 13 0.9541120
## 14 0.7117713
## 15 0.7117713
## 16 0.6209101
## 17 0.9428695
## 18 0.8969263
## 19 0.9971211
## 20 0.7992640
## 21 0.8155648
## 22 0.6553398
## 23 0.6553398
## 24 0.9999937
## 25 0.9450287
## 26 0.7485703
## 27 0.7479272
## 28 0.8615093
## 29 0.8615093
## 30 0.6831029
## 31 0.6831029
## 32 0.7883076
## 33 0.7883076
## 34 0.7883076
## 35 0.8595943
## 36 0.5402922
## 37 0.5402922
## 38 0.9999403
## 39 0.7328952
## 40 0.6878086
## 41 0.9371880
## 42 0.9999948
## 43 0.7441066
## 44 0.7441066
## 45 0.6883825
## 46 0.7112593
## 47 0.6339993
## 48 0.9069753
## 49 0.9069767
## 50 0.9049252
## 51 0.8462352
## 52 0.7479272
## 53 0.7451339
classification_summary %>%
select(subject_id, sample_id, episode_number, visit_date, days_since_enrolment, days_since_last_episode, starts_with("posterior_"), starts_with("joint"), prop_same_haps_1minusPNH, pnh_range, ibs_range, classification) %>%
pivot_longer(cols = (starts_with("posterior")),
names_to = "posterior_type",
values_to = "posterior_value") %>%
mutate(posterior_range = cut(posterior_value,
breaks = c(0, 0.25, 0.5, 0.75, 1),
include.lowest = TRUE,
labels = c("0-0.25", "0.25-0.5", "0.5-0.75", "0.75-1"))) %>%
count(ibs_range, classification, posterior_type, posterior_range)
classification_summary %>%
select(subject_id, sample_id, pair, episode_number, visit_date, days_since_enrolment, days_since_last_episode, starts_with("posterior_"), prop_same_haps_1minusPNH, pnh_range, ibs_range, classification) %>%
pivot_longer(cols = (starts_with("posterior")),
names_to = "posterior_type",
values_to = "posterior_value") %>%
mutate(posterior_range = cut(posterior_value,
breaks = c(0, 0.25, 0.5, 0.75, 1),
include.lowest = TRUE,
labels = c("0-0.25", "0.25-0.5", "0.5-0.75", "0.75-1"))) %>%
ggplot(aes(x = reorder(pair, episode_number), y = posterior_value, group = episode_number, fill = posterior_type)) +
geom_bar(stat = "identity", position = "fill") +
scale_fill_manual(values = c("turquoise3", "magenta3", "skyblue4"),
labels = c("posterior_probability_relapse" = "Relapse",
"posterior_probability_recrudescence" = "Recrudescence",
"posterior_probability_reinfection" = "Reinfection")) +
scale_x_discrete(expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0)) +
labs(x = "Sample pair",
y = "Posterior marginal probability",
fill = "") +
theme_bw() +
facet_wrap(~ibs_range, scales = "free_x") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
### Plot IBS range vs probabilistic classification (joint probability,
highest ranking)
classification_summary %>%
select(subject_id, sample_id, pair, episode_number, visit_date, days_since_enrolment, days_since_last_episode, starts_with("joint"), prop_same_haps_1minusPNH, pnh_range, ibs_range, classification) %>%
ggplot(aes(x = reorder(pair, episode_number), y = joint_probability, group = episode_number, fill = joint_classification)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("Relapse" = "turquoise3",
"Recrudescence" = "skyblue4",
"Reinfection" = "magenta3")) +
scale_x_discrete(expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0)) +
labs(x = "Sample pair",
y = "Posterior joint probability (highest ranking classification)",
fill = "") +
theme_bw() +
facet_wrap(~ibs_range, scales = "free_x") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
classification_summary %>%
select(subject_id, sample_id, episode_number, visit_date, days_since_enrolment, days_since_last_episode, starts_with("posterior_"), ibd_range, CI_lower, CI_upper, sig_est) %>%
pivot_longer(cols = (starts_with("posterior")),
names_to = "posterior_type",
values_to = "posterior_value") %>%
mutate(posterior_range = cut(posterior_value,
breaks = c(0, 0.25, 0.5, 0.75, 1),
include.lowest = TRUE,
labels = c("0-0.25", "0.25-0.5", "0.5-0.75", "0.75-1"))) %>%
count(ibd_range, sig_est, posterior_type, posterior_range)
ibd_r_data %>%
filter(comparison_type == "paired") %>%
ggplot(aes(x = pair, y = estimate, group = patientid1, color = sig_est)) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = CI_lower, ymax = CI_upper)) +
scale_color_manual(values = c("not significant" = "darkgrey",
"significant" = "indianred3")) +
facet_wrap(~patientid1, scales = "free_x") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
classification_summary %>%
ggplot(aes(x = reorder(sampleid1, days_since_enrolment), y = estimate, group = subject_id, color = sig_est)) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = CI_lower, ymax = CI_upper)) +
scale_color_manual(values = c("not significant" = "darkgrey",
"significant" = "indianred3")) +
labs(x = "Sample",
y = "IBD",
fill = "Significance") +
facet_wrap(~subject_id, scales = "free_x") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
ibd_r_data %>%
filter(comparison_type == "not paired") %>%
ggplot(aes(x = pair, y = estimate, group = patientid1, color = sig_est)) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = CI_lower, ymax = CI_upper)) +
scale_color_manual(values = c("not significant" = "darkgrey",
"significant" = "indianred3")) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
classification_summary %>%
ggplot(aes(x = days_since_enrolment, y = estimate, group = subject_id, color = sig_est)) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = CI_lower, ymax = CI_upper)) +
scale_color_manual(values = c("not significant" = "darkgrey",
"significant" = "indianred3")) +
labs(x = "Days since last episode",
y = "IBD",
fill = "Significance") +
facet_wrap(~subject_id, scales = "free_x") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
classification_summary %>%
select(subject_id, sample_id, pair, episode_number, visit_date, days_since_enrolment, days_since_last_episode, starts_with("posterior_"), ibd_range, CI_lower, CI_upper, sig_est) %>%
pivot_longer(cols = (starts_with("posterior")),
names_to = "posterior_type",
values_to = "posterior_value") %>%
mutate(posterior_range = cut(posterior_value,
breaks = c(0, 0.25, 0.5, 0.75, 1),
include.lowest = TRUE,
labels = c("0-0.25", "0.25-0.5", "0.5-0.75", "0.75-1"))) %>%
ggplot(aes(x = reorder(pair, episode_number), y = posterior_value, group = episode_number, fill = posterior_type)) +
geom_bar(stat = "identity", position = "fill") +
scale_fill_manual(values = c("turquoise3", "magenta3", "skyblue4"),
labels = c("posterior_probability_relapse" = "Relapse",
"posterior_probability_recrudescence" = "Recrudescence",
"posterior_probability_reinfection" = "Reinfection")) +
scale_x_discrete(expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0)) +
labs(x = "Sample pair",
y = "Posterior marginal probability",
fill = "") +
theme_bw() +
facet_wrap(~ibd_range, scales = "free_x") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
classification_summary %>%
select(subject_id, sample_id, pair, episode_number, visit_date, days_since_enrolment, days_since_last_episode, starts_with("joint"), ibd_range, CI_lower, CI_upper, sig_est) %>%
ggplot(aes(x = reorder(pair, episode_number), y = joint_probability, group = episode_number, fill = joint_classification)) +
geom_bar(stat = "identity") +
scale_fill_manual(values = c("Relapse" = "turquoise3",
"Recrudescence" = "skyblue4",
"Reinfection" = "magenta3")) +
scale_x_discrete(expand = c(0, 0)) +
scale_y_continuous(expand = c(0, 0)) +
labs(x = "Sample pair",
y = "Posterior joint probability (highest ranking classification)",
fill = "") +
theme_bw() +
facet_wrap(~ibd_range, scales = "free_x") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
confusion_data_ibsrange <- classification_summary %>%
select(subject_id, sample_id, episode_number, visit_date, days_since_enrolment, days_since_last_episode, starts_with("posterior_"), starts_with("joint"), prop_same_haps_1minusPNH, pnh_range, ibs_range,
# renaming for clarity
pnh_classification = classification) %>%
pivot_longer(cols = (starts_with("posterior")),
names_to = "marginal_classification",
values_to = "marginal_probability") %>%
mutate(marginal_classification = case_when(marginal_classification == "posterior_probability_recrudescence" ~ "Recrudescence",
marginal_classification == "posterior_probability_relapse" ~ "Relapse",
marginal_classification == "posterior_probability_reinfection" ~ "Reinfection"),
marginal_classification = factor(marginal_classification,
levels = c("Relapse", "Recrudescence", "Reinfection")),
marginal_probability_range = cut(marginal_probability,
breaks = c(0, 0.25, 0.5, 0.75, 1),
include.lowest = TRUE,
labels = c("0-0.25", "0.25-0.5", "0.5-0.75", "0.75-1")),
joint_probability_range = cut(joint_probability,
breaks = c(0, 0.25, 0.5, 0.75, 1),
include.lowest = TRUE,
labels = c("0-0.25", "0.25-0.5", "0.5-0.75", "0.75-1"))) %>%
select(subject_id, episode_number, days_since_enrolment, ibs_range, pnh_classification, marginal_classification, marginal_probability, marginal_probability_range, joint_classification, joint_probability, joint_probability_range) %>%
# now only keep the highest 'ranking' marginal classification
group_by(subject_id, episode_number) %>%
slice_max(order_by = marginal_probability, n = 1, with_ties = F) %>%
ungroup() %>%
group_by(ibs_range, marginal_classification, joint_classification) %>%
tally() %>%
dcast(ibs_range ~ marginal_classification + joint_classification, value.var = "n", fill = 0)
confusion_data_ibsrange %>%
melt(id.vars = "ibs_range") %>%
ggplot(aes(ibs_range, variable, fill = value)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "red") +
geom_text(aes(label = value)) +
theme_minimal() +
labs(x = "IBS range (1-PNH)",
y = "Marginal vs joint classification",
fill = "Count")
# Create a confusion matrix for each pair
confusion_data_pnhstate <-
classification_summary %>%
select(subject_id, sample_id, episode_number, visit_date, days_since_enrolment, days_since_last_episode, starts_with("posterior_"), starts_with("joint"), prop_same_haps_1minusPNH, pnh_range, ibs_range,
# renaming for clarity
pnh_classification = classification) %>%
pivot_longer(cols = (starts_with("posterior")),
names_to = "marginal_classification",
values_to = "marginal_probability") %>%
mutate(marginal_classification = case_when(marginal_classification == "posterior_probability_recrudescence" ~ "Recrudescence",
marginal_classification == "posterior_probability_relapse" ~ "Relapse",
marginal_classification == "posterior_probability_reinfection" ~ "Reinfection"),
marginal_classification = factor(marginal_classification,
levels = c("Relapse", "Recrudescence", "Reinfection")),
marginal_probability_range = cut(marginal_probability,
breaks = c(0, 0.25, 0.5, 0.75, 1),
include.lowest = TRUE,
labels = c("0-0.25", "0.25-0.5", "0.5-0.75", "0.75-1")),
joint_probability_range = cut(joint_probability,
breaks = c(0, 0.25, 0.5, 0.75, 1),
include.lowest = TRUE,
labels = c("0-0.25", "0.25-0.5", "0.5-0.75", "0.75-1")),
# pnh_state = case_when(pnh_classification == "Heterologous" ~ "Reinfection",
# pnh_classification == "Homologous" ~ "Relapse or Recrudescence")
pnh_state = case_when(ibs_range == "0-0.25" | ibs_range == "0.25-0.5" ~ "Reinfection",
(ibs_range == "0.5-0.75" | ibs_range == "0.75-1") ~ "Relapse or Recrudescence")
) %>%
select(subject_id, episode_number, days_since_enrolment, ibs_range, pnh_classification, pnh_state, marginal_classification, marginal_probability, marginal_probability_range, joint_classification, joint_probability, joint_probability_range) %>%
# now only keep the highest 'ranking' marginal classification
group_by(subject_id, episode_number) %>%
slice_max(order_by = marginal_probability, n = 1, with_ties = F) %>%
ungroup() %>%
group_by(pnh_state, marginal_classification, joint_classification) %>%
tally() %>%
dcast(pnh_state ~ marginal_classification + joint_classification, value.var = "n", fill = 0)
confusion_data_pnhstate %>%
melt(id.vars = "pnh_state") %>%
ggplot(aes(pnh_state, variable, fill = value)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "red") +
geom_text(aes(label = value)) +
theme_minimal() +
labs(x = "PNH classification",
y = "Marginal vs joint classification",
fill = "Count")